package com.demo;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

import com.demo.biz.ExamService;

/**
 * Breadth-first crawler for tiku.com: it is seeded with paginated
 * {@code testPaper.html} listing pages and hands every fetched
 * {@code question.html} page to {@link ExamService}.
 */
public class ExamCrawler extends BreadthCrawler {

	/**
	 * Listing-page URL template; {@code %s} is replaced by the 1-based page number.
	 * Earlier variants of this query used other {@code uid} values
	 * (800417, 800418, 800421).
	 */
	private static final String TEST_PAPER_URL_TEMPLATE =
			"http://www.tiku.com/testPaper.html?sct=0&cn=&st=0&cid=500014&bid=800104&vid=800016&uid=800435&qtid=600122&difficulty=&sort=0&page=%s";

	/** Regex rule registered with the crawler so that question pages are followed. */
	private static final String QUESTION_URL_REGEX = "http://www.tiku.com/question.html.*";

	/** Substring that identifies a question page URL in {@link #visit(Page, CrawlDatums)}. */
	private static final String QUESTION_PAGE_MARKER = "question.html";

	/**
	 * Creates a crawler without any seed or regex rule. The caller has to
	 * register them (for example with {@code addSeed} / {@code addRegex})
	 * before starting the crawl.
	 *
	 * @param crawlPath value forwarded to the {@link BreadthCrawler} constructor
	 * @param autoParse value forwarded to the {@link BreadthCrawler} constructor
	 */
	public ExamCrawler(String crawlPath, boolean autoParse) {
		super(crawlPath, autoParse);
	}

	/**
	 * Creates a crawler seeded with listing pages {@code 1..maxPageNum} and
	 * registers the question-page regex rule.
	 *
	 * @param crawlPath  value forwarded to the {@link BreadthCrawler} constructor
	 * @param autoParse  value forwarded to the {@link BreadthCrawler} constructor
	 * @param maxPageNum last listing page to seed (inclusive); when it is less
	 *                   than 1 no seed is added
	 */
	public ExamCrawler(String crawlPath, boolean autoParse, int maxPageNum) {
		super(crawlPath, autoParse);
		for (int pageNum = 1; pageNum <= maxPageNum; pageNum++) {
			// Each seed carries its page number as "page" metadata.
			CrawlDatum seed = new CrawlDatum(createUrl(pageNum)).meta("page", String.valueOf(pageNum));
			addSeed(seed);
		}
		this.addRegex(QUESTION_URL_REGEX);
	}

	/**
	 * Processes a fetched page. Only pages whose URL contains
	 * {@code question.html} are handled; all other pages are ignored here.
	 *
	 * @param page the fetched page
	 * @param next follow-up tasks container; not modified by this method
	 */
	@Override
	public void visit(Page page, CrawlDatums next) {
		String url = page.getUrl();
		if (!url.contains(QUESTION_PAGE_MARKER)) {
			return;
		}
		// Question detail page: delegate to ExamService
		// (insert() presumably persists the parsed question - confirm in ExamService).
		ExamService examService = new ExamService(page);
		examService.insert();
	}

	/**
	 * Runs the crawler over the first three listing pages.
	 *
	 * @param args unused
	 * @throws Exception propagated from the crawler
	 */
	public static void main(String[] args) throws Exception {
		ExamCrawler crawler = new ExamCrawler("crawl", true, 3);
		crawler.setThreads(5);
		// presumably milliseconds - confirm against the WebCollector documentation
		crawler.setExecuteInterval(1000);
		// Optional settings left disabled: crawler.setTopN(10), crawler.setResumable(true).
		/* start crawl with depth of 4 */
		crawler.start(4);
	}

	/**
	 * Builds the URL of one listing page.
	 *
	 * @param pageNum page number substituted into the {@code page} query parameter
	 * @return the listing-page URL for {@code pageNum}
	 */
	public static String createUrl(int pageNum) {
		return String.format(TEST_PAPER_URL_TEMPLATE, pageNum);
	}

}
